library(dplyr)
library(tidyr)
library(ggplot2)

theme_set(theme_minimal())

TODOs

Data

source: https://www.dwd.de/DE/leistungen/klimadatendeutschland/klarchivtagmonat.html

raw_hist <- read.delim('data/produkt_klima_tag_19500101_20221231_00403.txt', sep = ';')
head(raw_hist)
##   STATIONS_ID MESS_DATUM QN_3   FX   FM QN_4  RSK RSKF  SDK SHK_TAG  NM VPM
## 1         403   19500101 -999 -999 -999    5  2.2    7 -999       0 5.0 4.0
## 2         403   19500102 -999 -999 -999    5 12.6    8 -999       0 8.0 6.1
## 3         403   19500103 -999 -999 -999    5  0.5    1 -999       0 5.0 6.5
## 4         403   19500104 -999 -999 -999    5  0.5    7 -999       0 7.7 5.2
## 5         403   19500105 -999 -999 -999    5 10.3    7 -999       0 8.0 4.0
## 6         403   19500106 -999 -999 -999    5  7.2    8 -999      12 7.3 5.6
##       PM  TMK UPM  TXK  TNK  TGK eor
## 1 1025.6 -3.2  83 -1.1 -4.9 -6.3 eor
## 2 1005.6  1.0  95  2.2 -3.7 -5.3 eor
## 3  996.6  2.8  86  3.9  1.7 -1.4 eor
## 4  999.5 -0.1  85  2.1 -0.9 -2.3 eor
## 5 1001.1 -2.8  79 -0.9 -3.3 -5.2 eor
## 6  997.5  2.6  79  5.0 -4.0 -4.0 eor
raw_pres <- read.delim('data/produkt_klima_tag_20221107_20240509_00403.txt', sep = ';')
head(raw_pres)
##   STATIONS_ID MESS_DATUM QN_3   FX   FM QN_4 RSK RSKF SDK SHK_TAG  NM  VPM
## 1         403   20221107 -999 -999 -999   10 0.0    6 4.5       0 6.2  9.6
## 2         403   20221108 -999 -999 -999   10 0.2    6 7.5       0 6.0 10.4
## 3         403   20221109 -999 -999 -999   10 1.0    6 3.7       0 6.6 11.4
## 4         403   20221110 -999 -999 -999   10 0.0    0 6.1       0 5.1 10.2
## 5         403   20221111 -999 -999 -999   10 0.0    0 1.9       0 6.3  9.6
## 6         403   20221112 -999 -999 -999   10 0.0    0 7.3       0 4.0  8.8
##       PM  TMK UPM  TXK TNK  TGK eor
## 1 1002.9 10.7  75 15.0 6.4  5.1 eor
## 2 1002.7 12.1  75 16.9 7.9  4.2 eor
## 3 1001.5 11.8  83 15.0 9.0  5.1 eor
## 4 1012.6 11.7  74 14.3 8.6  5.8 eor
## 5 1020.1  8.6  87 12.8 4.0  0.6 eor
## 6 1022.8  6.4  92 13.8 1.8 -0.9 eor
meas <- bind_rows(raw_hist, raw_pres) |>
    select(date = MESS_DATUM, temp = TMK) |>
    mutate(date = as.POSIXct(strptime(date, "%Y%m%d")),
           year = as.integer(as.numeric(format(date, "%Y"))),
           day = as.integer(as.numeric(format(date, "%j")))) |>
    distinct(date, .keep_all = TRUE)
rm(raw_hist, raw_pres)
stopifnot(all(count(meas, date)$n == 1))
head(meas)
##         date temp year day
## 1 1950-01-01 -3.2 1950   1
## 2 1950-01-02  1.0 1950   2
## 3 1950-01-03  2.8 1950   3
## 4 1950-01-04 -0.1 1950   4
## 5 1950-01-05 -2.8 1950   5
## 6 1950-01-06  2.6 1950   6
ggplot(meas, aes(date, temp)) +
    geom_line() +
    geom_smooth(method = "gam")

filter(meas, year >= 2018) |>
    ggplot(aes(date, temp)) +
        geom_line()

filter(meas, year == 2023) |>
    ggplot(aes(date, temp)) +
        geom_line() +
        geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(meas, aes(day, temp, color = year)) +
    geom_point(alpha = 0.25) +
    scale_color_binned(type = 'viridis')

m1 <- lm(temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m1)
## 
## Call:
## lm(formula = temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366), 
##     data = meas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3493  -2.5635  -0.0229   2.5823  14.0507 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            9.41121    0.02290  410.93   <2e-16 ***
## cos(2 * pi * day/366) -9.00711    0.03244 -277.66   <2e-16 ***
## sin(2 * pi * day/366) -2.55724    0.03234  -79.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.774 on 27155 degrees of freedom
## Multiple R-squared:  0.7544, Adjusted R-squared:  0.7544 
## F-statistic: 4.17e+04 on 2 and 27155 DF,  p-value: < 2.2e-16
plot(m1)

meas_fit <- cbind(meas, pred = fitted(m1))

filter(meas_fit, year >= 2018) |>
    ggplot() +
        geom_line(aes(date, temp), alpha = 0.25) +
        geom_line(aes(date, pred), color = 'red')

ggplot(meas_fit) +
    geom_line(aes(date, temp), alpha = 0.25) +
    geom_line(aes(date, pred), color = 'red')

filter(meas_fit, year %in% (1950 + 0:6 * 10)) |>
    ggplot(aes(day, temp, color = year)) +
        geom_point(alpha = 0.25) +
        geom_line(aes(day, pred), color = 'red') +
        scale_color_binned(type = 'viridis')

m2 <- lm(temp ~ year + cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m2)
## 
## Call:
## lm(formula = temp ~ year + cos(2 * pi * day/366) + sin(2 * pi * 
##     day/366), data = meas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0982  -2.5367  -0.0541   2.5753  13.0459 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -47.296595   2.091820  -22.61   <2e-16 ***
## year                    0.028544   0.001053   27.11   <2e-16 ***
## cos(2 * pi * day/366)  -9.010653   0.032010 -281.50   <2e-16 ***
## sin(2 * pi * day/366)  -2.564623   0.031911  -80.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.724 on 27154 degrees of freedom
## Multiple R-squared:  0.7609, Adjusted R-squared:  0.7608 
## F-statistic: 2.88e+04 on 3 and 27154 DF,  p-value: < 2.2e-16
plot(m2)

meas_fit2 <- cbind(meas, pred = fitted(m2))

filter(meas_fit2, year >= 2018) |>
    ggplot() +
        geom_line(aes(date, temp), alpha = 0.25) +
        geom_line(aes(date, pred), color = 'red')

ggplot(meas_fit2) +
    geom_line(aes(date, temp), alpha = 0.25) +
    geom_line(aes(date, pred), color = 'red')

filter(meas_fit2, year %in% (1950 + 0:6 * 10)) |>
    ggplot(aes(day, temp, color = year)) +
        geom_point(alpha = 0.25) +
        geom_line(aes(day, pred, color = year)) +
        scale_color_binned(type = 'viridis')

resid <- meas_fit2$temp - meas_fit2$pred
ggplot(data.frame(resid = resid), aes(resid)) +
    geom_histogram(bins = 20)

quantile(resid, c(0.05, 0.95))
##        5%       95% 
## -5.907213  6.098858
resid_stats <- group_by(meas_fit2, year) |>
    summarise(me = mean(temp - pred),
              mae = mean(abs(temp - pred)),
              prop_days_warmer = mean(temp > pred + 6),
              prop_days_colder = mean(temp < pred - 6))
              #rmse = sqrt(mean((temp - pred)^2)))
resid_stats
## # A tibble: 75 × 5
##     year       me   mae prop_days_warmer prop_days_colder
##    <int>    <dbl> <dbl>            <dbl>            <dbl>
##  1  1950  0.942    2.81           0.0932           0.0329
##  2  1951  1.26     2.83           0.0493           0     
##  3  1952  0.00242  2.81           0.0492           0.0246
##  4  1953  1.56     3.04           0.112            0.0137
##  5  1954 -0.258    3.05           0.0493           0.0658
##  6  1955 -0.321    2.96           0.0384           0.0603
##  7  1956 -0.977    3.48           0.0328           0.104 
##  8  1957  0.677    3.03           0.0904           0.0301
##  9  1958  0.169    2.48           0.0219           0.0164
## 10  1959  1.04     2.85           0.0822           0.0164
## # ℹ 65 more rows
resid_stats_plt <- pivot_longer(resid_stats, !year, names_to = "measure")

filter(resid_stats_plt, measure %in% c("mae", "me")) |>
    ggplot(aes(x = year, y = value, fill = measure)) +
        geom_col(position = position_dodge())

filter(resid_stats_plt, measure %in% c("prop_days_warmer", "prop_days_colder")) |>
    ggplot(aes(x = year, y = value, fill = measure)) +
        geom_col(position = position_stack()) +
        scale_fill_discrete(limits = rev)

resid_stats_ordered <- filter(resid_stats, year < 2024) |>
    arrange(mae)
resid_stats_ordered |> head(1)
## # A tibble: 1 × 5
##    year     me   mae prop_days_warmer prop_days_colder
##   <int>  <dbl> <dbl>            <dbl>            <dbl>
## 1  2017 -0.188  2.35           0.0274           0.0137
resid_stats_ordered |> tail(1)
## # A tibble: 1 × 5
##    year    me   mae prop_days_warmer prop_days_colder
##   <int> <dbl> <dbl>            <dbl>            <dbl>
## 1  1985 -1.20  3.55           0.0493            0.145
least_deviation_yr <- resid_stats_ordered |> head(1) |> pull(year)
most_deviation_yr <- resid_stats_ordered |> tail(1) |> pull(year)

least_most_plt <- data.frame(year = c(least_deviation_yr, most_deviation_yr), label = c("least deviation", "most deviation")) |>
    inner_join(meas_fit2, by = 'year') |>
    mutate(label = paste0(year, " (", label, ")"))

ggplot(least_most_plt, aes(day, temp, color = label)) +
        geom_point(alpha = 0.25) +
        geom_line(aes(day, pred))

ggplot(least_most_plt, aes(day, temp - pred, color = label)) +
        geom_point(alpha = 0.25) +
        geom_hline(yintercept = 0, linetype = "dashed")